library(readxl)
library(tidyverse)
library(tibble)
library("gridExtra")
library(ggpubr)
library(maps)
library(colorBlindness)
library(ggpattern)


#Civil War Court Martial Data ----  

counties = read_csv(file.choose()) # read county data from csv

civil_war_data = read_csv(file.choose()) # read civil war soldier data from csv

civil_war_data


# calculate a soldiers nationality by state or by country 
selected_civil_war_data =civil_war_data %>% 
  select(recidnum,r_city01,r_cnty01,r_stat01, rb_stat1,mb_stat,fb_stat,lstyp1_1,milcact1:milcact5) %>%
  mutate(nationality = case_when(
      rb_stat1 %in% state.abb  ~ "America",
      rb_stat1 == "IR" ~ "Ireland",
      rb_stat1 == "GE" ~ "Germany",
      rb_stat1 == "CD" ~ "Canada",
      rb_stat1 == "EN" ~ "England",
      rb_stat1 == "ST" ~ "Scotland",
      rb_stat1 == "FR" ~ "France",
      rb_stat1 == "PU" ~ "Prussia",
      rb_stat1 == "NW" ~ "Norway"
  ))

selected_civil_war_data %>% # show sorted distribution of counties
  group_by(r_cnty01) %>%
  count() %>%
  arrange(-n)


cities = counties %>% # get lat/long data for the "capital" city of each county
  mutate(position = paste(toupper(county),state_id, sep = " ")) %>%
  select(position, lat, lng)

selected_civil_war_data = selected_civil_war_data %>% #get county/state postition of each soldier
  mutate(position = paste(r_cnty01,r_stat01, sep = " "))


# map lat/long data onto each soldier based on their county position
selected_civil_war_data = left_join(selected_civil_war_data,cities, by=c("position")) 


# for each soldier, check over each milcact row for offenses, then assign
# desertion and court martial status
bar_graph_data = selected_civil_war_data %>%
  rename(recidnum...1 = recidnum, milcact1...11=milcact1, milcact2...12=milcact2 ,milcact3...13=milcact3 ,milcact4...14=milcact4, milcact5...15=milcact5) %>%
  select(nationality,lat,lng,lstyp1_1,milcact1...11:milcact5...15,recidnum...1) %>%
  mutate(unit = substr(recidnum...1, 0, 7)) %>%
  mutate(deserter = (milcact1...11 == "D" | milcact2...12 == "D" | milcact3...13== "D" | milcact4...14== "D" | milcact5...15== "D")) %>%
  mutate(deserter = replace_na(deserter,FALSE)) %>%
  mutate(arrested = (milcact1...11 == "A" | milcact2...12 == "A" | milcact3...13== "A" | milcact4...14== "A" | milcact5...15== "A")) %>%
  mutate(arrested = replace_na(arrested,FALSE)) %>%
  mutate(awol = (milcact1...11 == "W" | milcact2...12 == "W" | milcact3...13== "W" | milcact4...14== "W" | milcact5...15== "W")) %>%
  mutate(awol = replace_na(awol,FALSE)) %>%
  mutate(mia = (milcact1...11 == "M" | milcact2...12 == "M" | milcact3...13== "M" | milcact4...14== "M" | milcact5...15== "M")) %>%
  mutate(mia = replace_na(mia,FALSE)) %>%
  mutate(court = deserter | awol | arrested) %>%
  select(recidnum...1,nationality, deserter,court)


  

# Create bar graph showing court-martial and desertion rate from above data
# B/W patterned version
bar_graph_data %>%
  rename("Court-Martial Rate" = court, "Desertion Rate" = deserter,Nationality=nationality) %>%
  pivot_longer("Court-Martial Rate":"Desertion Rate",names_to = "variable", values_to = "value") %>%
  group_by(Nationality,variable) %>%
  summarise(value=mean(value),n=n(),SE = 1/ sqrt(4*n)*100) %>%
  filter(Nationality %in% (c("America","Germany","Ireland"))) %>%
  mutate(pattern_data = case_when(
    Nationality == "America" ~ 0.25,
    Nationality == "Germany" ~ 0.4,
    TRUE                     ~ 0.7
  )) %>%
  ggplot(aes(x=Nationality,fill=variable,y=value*100)) +
  #geom_bar(position='dodge', stat='identity',color="black") +
  #geom_bar(stat='identity',color="black") +
  geom_col_pattern(aes(pattern = as_factor(pattern_data)), 
                   fill  = 'white',colour  = 'black',alpha=.5) +
  labs(x="",y="Rate (%)",title="Court-Martial and Desertion Rates by Nationality") +
  theme_minimal() +
  facet_grid(cols = vars(variable)) +
  geom_errorbar(position=position_dodge(width=0.9),aes(x=Nationality, ymin=value*100-SE, ymax=value*100+SE), width=0.2, size= 1, color="black") +
  theme(legend.position = "none", strip.text.x = element_text(size = 12))


# Create bar graph showing court-martial and desertion rate from above data
# solid color version
bar_graph_data %>%
  rename("Court-Martial Rate" = court, "Desertion Rate" = deserter,Nationality=nationality) %>%
  pivot_longer("Court-Martial Rate":"Desertion Rate",names_to = "variable", values_to = "value") %>%
  group_by(Nationality,variable) %>%
  summarise(value=mean(value),n=n(),SE = 1/ sqrt(4*n)*100) %>%
  filter(Nationality %in% (c("America","Germany","Ireland"))) %>%
  mutate(pattern_data = case_when(
    Nationality == "America" ~ 0.25,
    Nationality == "Germany" ~ 0.4,
    TRUE                     ~ 0.7
  )) %>%
  ggplot(aes(x=Nationality,fill=Nationality,y=value*100)) +
  geom_bar(position='dodge', stat='identity') +
  #geom_col_pattern(aes(pattern = as_factor(pattern_data)), 
  #                 fill  = 'white',colour  = 'black',alpha=.5) +
  labs(x="",y="Rate (%)",title="Court-Martial and Desertion Rates by Nationality") +
  #scale_fill_manual(values = c("white","white","white")) + 
  scale_fill_manual(values = c("#cccccc","#969696","#636363")) +
  theme_minimal() +
  facet_grid(cols = vars(variable)) +
  geom_errorbar(position=position_dodge(width=0.9),aes(x=Nationality, ymin=value*100-SE, ymax=value*100+SE), width=0.2, size= .5, color="black") +
  theme(legend.position = "none", strip.text.x = element_text(size = 12))


# For each soldier, assign court-martial and deserter status
# Then group by unit and average to get court and desertion rates
# Find the unit spread for each unit by calculating the std dev of the distance 
# between each unit and the mean centroid location
# Also calculates the D or S rate (rate of enlistment type)
unit_court_big = selected_civil_war_data %>%
  rename(recidnum...1 = recidnum, milcact1...11=milcact1, milcact2...12=milcact2 ,milcact3...13=milcact3 ,milcact4...14=milcact4, milcact5...15=milcact5) %>%
  select(nationality,lat,lng,lstyp1_1,milcact1...11:milcact5...15,recidnum...1) %>%
  mutate(unit = substr(recidnum...1, 0, 7)) %>%
  mutate(deserter = (milcact1...11 == "D" | milcact2...12 == "D" | milcact3...13== "D" | milcact4...14== "D" | milcact5...15== "D")) %>%
  mutate(deserter = replace_na(deserter,FALSE)) %>%
  mutate(arrested = (milcact1...11 == "A" | milcact2...12 == "A" | milcact3...13== "A" | milcact4...14== "A" | milcact5...15== "A")) %>%
  mutate(arrested = replace_na(arrested,FALSE)) %>%
  mutate(awol = (milcact1...11 == "W" | milcact2...12 == "W" | milcact3...13== "W" | milcact4...14== "W" | milcact5...15== "W")) %>%
  mutate(awol = replace_na(awol,FALSE)) %>%
  mutate(mia = (milcact1...11 == "M" | milcact2...12 == "M" | milcact3...13== "M" | milcact4...14== "M" | milcact5...15== "M")) %>%
  mutate(mia = replace_na(mia,FALSE)) %>%
  mutate(court = deserter | awol | arrested) %>%
  mutate(D = lstyp1_1=='D',S = lstyp1_1=='S',D_or_S = D | S)%>%
  mutate(D = replace_na(D,FALSE),S = replace_na(S,FALSE),D_or_S = replace_na(D_or_S,FALSE)) %>%
  group_by(unit)%>%
  mutate(centroid_lat = median(lat,na.rm = TRUE), centroid_lng = median(lng,na.rm=TRUE))%>%
  mutate(unit_spread = sd (  ((lat-centroid_lat)^2 + (lng-centroid_lng)^2)^(.5),na.rm = TRUE) * 69) %>%
  mutate(desertion_rate = mean(deserter),arrest_rate=mean(arrested),mia_rate=mean(mia),awol_rate=mean(awol),court_rate = mean(court), total_obs = n()) %>%
  mutate(D_rate = mean(D), S_rate = mean(S),D_or_S_rate = mean(D_or_S) , total_obs = n()) %>%
  select(unit,unit_spread,court_rate,desertion_rate,arrest_rate,mia_rate,awol_rate,total_obs,D_rate,S_rate,D_or_S_rate) %>%
  unique() %>%
  arrange(-desertion_rate)

# calculate the demographic rates for each unit
unit_demo_big = selected_civil_war_data %>%
  rename(recidnum...1 = recidnum, milcact1...11=milcact1, milcact2...12=milcact2 ,milcact3...13=milcact3 ,milcact4...14=milcact4, milcact5...15=milcact5) %>%
  select(recidnum...1,nationality) %>%
  mutate(unit = substr(recidnum...1, 0, 7)) %>%
  group_by(unit) %>%
  mutate(size = n()) %>%
  group_by(nationality,unit) %>%
  mutate(nationality_count = n()) %>%
  unique() %>%
  pivot_wider(names_from = nationality, values_from = nationality_count) %>%
  group_by(unit) %>%
  summarise(size,across(England:Norway, ~ mean(.x, na.rm = TRUE))) %>% 
  unique() %>%
  group_by(unit) %>%
  mutate(across(c(England:Norway),.fns = ~./size)) %>%
  mutate_all(~replace(., is.nan(.), 0)) %>%
  arrange(America) %>%
  unique()



# Show unit sizes
unit_demo_big %>%
  select(size) %>%
  ggplot(aes(x=size)) +
  geom_histogram(bins=50)

# Combine the court data with demographic data by unit
comb = inner_join(unit_court_big,unit_demo_big, by=c("unit"))
comb

# Show desertion rate by unit spread (counfounding variable)
comb %>% 
  filter(size > 5) %>%
  ggplot(.,aes(x=unit_spread,y=desertion_rate*100)) +
  geom_point(size=1) + 
  stat_smooth(method = lm,se=FALSE) + 
  #stat_cor(aes(label = paste(..r.label..,..p.label.., sep = "~`,`~")),label.x=60,label.y = 50,size=5,family = "sans",)+
  theme_minimal() +
  theme(
    text = element_text(family = "sans"),
    plot.title = element_text(hjust=.5,size=14),
  )

# get a filtered set for linear model
filtered = comb %>% 
  filter(size > 20) %>%
  mutate(non_America = (1-America))


filtered

# Linear model for court_rate/desertion rate based on demographics, with 
# confounding variables for unit_spread and recuitment type
linear_model = lm(court_rate ~ non_America+D_or_S_rate+unit_spread, data=filtered)
summary(linear_model)

# scatter plot for desertion rate by non-american (stats pre-calculated in the 
# linear model above)
comb %>%
  filter(size > 5) %>%
  ggplot(.,aes(x=(1-America)*100,y=desertion_rate*100)) +
  geom_point(size=1) + 
  stat_smooth(method = lm,se=FALSE) + 
  annotate("text", x = 90, y = 40,size=5, label = "R^2 = 0.24,\np = 3.14e-16") +
  theme_minimal() +
  labs(x="Percent of Company that is non-American",y="Percent of Soldiers Deserted ",
       title="Desertion Rate by Company Diversity")+
  theme(
    text = element_text(family = "sans"),
    plot.title = element_text(hjust=.5,size=14),
  )

# scatter plot for desertion rate by Irish (stats pre-calculated in the 
# linear model above)
comb %>%
  filter(size > 50) %>%
  ggplot(.,aes(x=Ireland*100,y=desertion_rate*100)) +
  geom_point(size=1) + 
  stat_smooth(method = lm,se=FALSE) + 
  annotate("text", x = 50, y = 54,size=5, label = "R^2 = 0.30,\np < 2e-16") +
  #stat_cor(aes(label = paste(..r.label..,..p.label.., sep = "~`,`~")),label.x=35,label.y = 50,size=5,family = "sans",)+
  theme_minimal() +
  labs(x="Percent of Company that is Irish"
       ,y="Percent of Soldiers Deserted",
       title="Desertion Rate by Percent Irish")+
  theme(
    text = element_text(family = "sans"),
    plot.title = element_text(hjust=.5,size=14),
  )


# scatter plot for desertion rate by German (stats pre-calculated in the 
# linear model above)
comb %>%
  filter(size > 50) %>%
  ggplot(.,aes(x=Germany*100,y=desertion_rate*100)) +
  geom_point(size=1) + 
  stat_smooth(method = lm,se=FALSE) + 
  annotate("text", x = 75, y = 40,size=5, label = "R^2 = 0.092,\np = 0.0015") +
  #stat_cor(aes(label = paste(..r.label..,..p.label.., sep = "~`,`~")),label.x=35,label.y = 50,size=5,family = "sans",)+
  theme_minimal() +
  labs(x="Percent of Company that is German"
       ,y="Percent of Soldiers Deserted",
       title="Desertion Rate by Percent German")+
  theme(
    text = element_text(family = "sans"),
    plot.title = element_text(hjust=.5,size=14),
  )

# scatter plot for court rate by non-american (stats pre-calculated in the 
# linear model above)
comb %>%
  filter(size > 5) %>%
  ggplot(.,aes(x=(1-America)*100,y=court_rate*100)) +
  geom_point(size=1) + 
  stat_smooth(method = lm,se=FALSE) + 
  annotate("text", x = 90, y = 48,size=5, label = "R^2 = 0.23,\np = 2.02e-15") +
  
  #stat_cor(aes(label = paste(..r.label..,..p.label.., sep = "~`,`~")),label.x=60,label.y = 50,size=5,family = "sans",)+
  theme_minimal() +
  labs(x="Percent of Company that is non-American",y="Percent of Soldiers Court-Martialed ",
       title="Court-Martial Rate by Company Diversity")+
  theme(
    text = element_text(family = "sans"),
    plot.title = element_text(hjust=.5,size=14),
  )

# scatter plot for court rate by Irish (stats pre-calculated in the 
# linear model above)
comb %>%
  filter(size > 50) %>%
  ggplot(.,aes(x=Ireland*100,y=court_rate*100)) +
  geom_point(size=1) + 
  stat_smooth(method = lm,se=FALSE) + 
  annotate("text", x = 55, y = 65,size=5, label = "R^2 = 0.31,\np < 2e-16") +
  #stat_cor(aes(label = paste(..r.label..,..p.label.., sep = "~`,`~")),label.x=35,label.y = 50,size=5,family = "sans",)+
  theme_minimal() +
  labs(x="Percent of Company that is Irish"
       ,y="Percent of Soldiers Court-Martialled",
       title="Court-Martial Rate by Percent Irish")+
  theme(
    text = element_text(family = "sans"),
    plot.title = element_text(hjust=.5,size=14),
  )


# scatter plot for court rate by German (stats pre-calculated in the 
# linear model above)
comb %>%
  filter(size > 50) %>%
  ggplot(.,aes(x=Germany*100,y=court_rate*100)) +
  geom_point(size=1) + 
  stat_smooth(method = lm,se=FALSE) + 
  annotate("text", x = 75, y = 44,size=5, label = "R^2 = 0.084,\np = 2.41e-06") +
  #stat_cor(aes(label = paste(..r.label..,..p.label.., sep = "~`,`~")),label.x=35,label.y = 50,size=5,family = "sans",)+
  theme_minimal() +
  labs(x="Percent of Company that is German"
       ,y="Percent of Soldiers Court-Martialled",
       title="Court-Martial Rate by Percent German")+
  theme(
    text = element_text(family = "sans"),
    plot.title = element_text(hjust=.5,size=14),
  )
